Take home Exercise 3 - 4 Feb 2022

Animation of graphs in R

Frostbear https://sg.linkedin.com/in/farahfoo (SMU Masters in IT business (Fintech and Analytics))https://scis.smu.edu.sg/master-it-business
2022-02-04

Context of Exercise

Using previous age-sex pyramid based on 2021 data, to apply appropriate interactivity and animation methods to design an age-sex pyramid based data visualisation to show the changes of demographic structure of Singapore by age cohort and gender between 2000-2020 at planning area level. The data set used is entitle Singapore Residents by Planning Area / Subzone, Age Group, Sex and Type of Dwelling, June 2000-2010 and Singapore Residents by Planning Area / Subzone, Age Group, Sex and Type of Dwelling, June 2011-2020, from Department of Statistics home page.

Installing and loading packages required for Age-Sex pyramid

packages = c('tidyverse', 'readxl', 'ggthemes')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Loading data using _csv command

pop_data <- read_csv("data/respopagesextod2021.csv")
glimpse (data)
function (..., list = character(), package = NULL, lib.loc = NULL, 
    verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)  

Adding Factor levels to AG field

To sort the age-sex pyramid using Age Group, we need to classify the AG field as Factor

pop_data$AG <- factor(pop_data$AG, levels = unique(pop_data$AG))

Summarising the data into the required buckets

summary_sex <- pop_data %>%
  group_by(AG, Sex) %>%
  summarise(Pop = sum(Pop)) %>%
  ungroup()

head (summary_sex,5)
# A tibble: 5 x 3
  AG       Sex        Pop
  <fct>    <chr>    <dbl>
1 0_to_4   Females  87730
2 0_to_4   Males    91400
3 5_to_9   Females  97120
4 5_to_9   Males   102390
5 10_to_14 Females  97980

Plotting double geom_bar Age-sex pyramid

ggplot(summary_sex, aes(x=AG)) +
  geom_bar(data=summary_sex[summary_sex$Sex=="Males",], aes(y=Pop*-1), stat="identity", fill="blue") +
  geom_bar(data=summary_sex[summary_sex$Sex=="Females",], aes(y=Pop), stat="identity", fill="pink") +
  geom_hline(yintercept=0, colour="white", lwd=1)+
coord_flip () +
scale_y_continuous(breaks = seq(-160000,160000,40000), labels = function(v) ifelse(abs(v)>=1000,paste0(abs(v)/1000, "K"), abs(v))) +
  labs(y="Population", x="Gender") +
  ggtitle("                        Male                                                Female")

Building base graph for 20 years of population data

For animation of population across time, data source can be found here at singstat website.

year2000 <- read_csv("data/respopagesextod2000to2010.csv")
year2011 <- read_csv("data/respopagesextod2011to2020.csv") 

head (year2000,3)
# A tibble: 3 x 7
  PA         SZ        AG     Sex   TOD                      Pop  Time
  <chr>      <chr>     <chr>  <chr> <chr>                  <dbl> <dbl>
1 Ang Mo Kio Cheng San 0_to_4 Males HDB 1- and 2-Room Fla~    20  2000
2 Ang Mo Kio Cheng San 0_to_4 Males HDB 3-Room Flats         480  2000
3 Ang Mo Kio Cheng San 0_to_4 Males HDB 4-Room Flats         220  2000
head (year2011,3)
# A tibble: 3 x 7
  PA         SZ                     AG     Sex   TOD         Pop  Time
  <chr>      <chr>                  <chr>  <chr> <chr>     <dbl> <dbl>
1 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 1- a~     0  2011
2 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 3-Ro~    10  2011
3 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 4-Ro~    30  2011
# Since columns are the same, we can combine the 2 files into 1 file for processing

combined <- rbind(year2000,year2011)
unique(combined$Time)
 [1] 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
[14] 2013 2014 2015 2016 2017 2018 2019 2020
# write_csv(combined, "combined.csv")

# in the Time column, there are only numbers, hence the row header was not copied into the data

Adding Factor levels to Age group field

To sort the age-sex pyramid using Age Group, we need to classify the AG field as Factor

combined$AG <- factor(combined$AG, levels = unique(combined$AG))

Summarising data by Age Group, Sex and Time

To plot the graph over the different years, we need to call out the Time field as a column (variable)

summary_sex_20 <- combined %>%
  group_by(AG, Sex, Time) %>%
  summarise(Pop = sum(Pop)) %>%
  ungroup()

head (summary_sex_20,5)
# A tibble: 5 x 4
  AG     Sex      Time    Pop
  <fct>  <chr>   <dbl>  <dbl>
1 0_to_4 Females  2000 108850
2 0_to_4 Females  2001 107510
3 0_to_4 Females  2002 105310
4 0_to_4 Females  2003 101430
5 0_to_4 Females  2004  99290

Plotting double geom_bar Age-sex pyramid for 20 years

Using the individual Age-sex pyramid from above (plotted for year 2021), we re-use the code to plot out 20 pyramid graphs, 1 graph for each year.

ggplot(summary_sex_20, aes(x=AG)) +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Males",], aes(y=Pop*-1), stat="identity", fill="blue") +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Females",], aes(y=Pop), stat="identity", fill="pink") +
  geom_hline(yintercept=0, colour="white", lwd=1)+
  
coord_flip () +
  
scale_y_continuous(breaks = seq(-160000,160000,40000), labels = function(v) ifelse(abs(v)>=1000,paste0(abs(v)/1000, "K"), abs(v))) +
  
  labs(title = "Age-Sex Population Pyramid, Singapore 2021", 
   caption = 'Data Source: Department of Statistics (June 2021)',
   y = "Population", x = "Gender") + 
  
  theme_bw() +
   theme(legend.position = "none")+
  theme(plot.title = element_text(size=16))+
  theme(plot.subtitle = element_text(size=12))+
  
facet_wrap(. ~ `Time`,ncol=4)

It is clear from the 20 graphs displayed, that the difference in population year on year is not clear. To show more clarity, we use the year as base to transition the graph in 1 frame in the next section.

Using gganimate

but first, we enhance the graph by

adding title caption theme find out the maximum and minimum values of the population to set the chart axis to ensure all the values will be captured properly.

Activating gganimate as it will be used for the animation of the age-sex pyramid over the 20 years

We call out the package required which is ggaminate.

packages = c('gganimate')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Then we find out the max and min values of the population set.

max(summary_sex_20$Pop)
[1] 164060
min(summary_sex_20$Pop)
[1] 1380

Improving the existing code by adding the range limits, title, subtitle and theme.

SG20 <- ggplot(summary_sex_20, aes(x=AG,colour=Sex,fill=Sex)) +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Males",], aes(y=Pop*-1), stat="identity") +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Females",], aes(y=Pop), stat="identity") +
  geom_hline(yintercept=0, colour="white", lwd=1) +
  
coord_flip() +
  
scale_y_continuous(limits = c(-170000, 170000), n.breaks = 10, labels = function(v) ifelse(abs(v)>= 1000,paste0(abs(v)/1000, "K"), abs(v))) +
  
  labs(title = "Singapore Age-Sex Population Pyramid for 20 years",
    subtitle = 'Year: "{round(frame_time, 0)}"',
    caption = 'Data Source: Department of Statistics (June 2000 to June 2020)',
  y = 'Male and Female Population',
  x = 'Age Group') +
  
  theme_bw () +
   theme(legend.text = element_text(size=12))+
  theme(plot.title = element_text(size=16))+
  theme(plot.subtitle = element_text(size=10))

SG20

Animating the age-sex pyramid

SG20 +
transition_time(Time) +
ease_aes('linear')

Doing Interactive plots

Interactive plots help to us compare the same point across 2 graphs

Loading packages for interactive plots

packages = c('tidyverse', 'readxl', 'ggthemes', 'ggiraph', 'plotly', 
             'gganimate', 'patchwork', 'DT', 'gifski', 'gapminder')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Creating new data set for interactive plots

interactive_data <- combined %>%
  spread (Sex, sum(Pop)) %>% 
  mutate(Total = Females + Males) %>% 
  group_by(Time, AG, PA) %>% 
  filter(PA == c("Ang Mo Kio", "Marine Parade")) %>% 
  summarise(Female = sum(Females), 
            Male =sum(Males), 
            Total = sum(Total)) 


interactive_data$AG <- factor(interactive_data$AG, levels = unique(interactive_data$AG))

interactive_data <- interactive_data %>% 
  mutate(Female_prop = Female / Total*100) %>% 
    mutate(Male_prop = Male / Total*100) 

head (interactive_data,5)
# A tibble: 5 x 8
# Groups:   Time, AG [3]
   Time AG       PA           Female  Male Total Female_prop Male_prop
  <dbl> <fct>    <chr>         <dbl> <dbl> <dbl>       <dbl>     <dbl>
1  2000 0_to_4   Ang Mo Kio     3020  3230  6250        48.3      51.7
2  2000 0_to_4   Marine Para~    280   280   560        50        50  
3  2000 5_to_9   Ang Mo Kio     4010  4030  8040        49.9      50.1
4  2000 5_to_9   Marine Para~    320   270   590        54.2      45.8
5  2000 10_to_14 Ang Mo Kio     3780  3830  7610        49.7      50.3
d <- highlight_key(interactive_data)
p1 <- ggplot (data = d,
        aes(x = Time, 
            y = Total)) + 
  geom_col() +
  labs(title = 'Population across time in 2 Planning areas') +
facet_wrap(. ~ `PA`)
  
  p1

p2 <- ggplot (data = d,
        aes(x =AG,
            y = Total)) + 
  geom_point() +
  labs(title = 'Total population across Age group in the Planning areas for 20 years')+
facet_wrap(. ~ `PA`) +
coord_flip()
  

p2

Putting 2 graphs side by side

Try clicking on 1 graph to see where the point is on the next graph

subplot (ggplotly (p1),
         ggplotly (p2))

Linking graph with data table using crosstalk

Try clicking on the table to see where is the point in the graph

Agegroup_PA <- combined %>% 
  group_by(AG, PA) %>% 
  summarise (Pop = sum (Pop))

d <- highlight_key(Agegroup_PA)

p3 <- ggplot (data = d, aes(x = AG, y = Pop))+ 
  geom_col () +
  ggtitle ("Age groups in Singapore's Planning Area",
subtitle = 'Planning Area: {PA}') +
  coord_flip()
gg <- highlight(ggplotly(p3),
                "plotly_selected")

crosstalk::bscols(gg,
                  DT::datatable(d),
                  widths = 20)

Animating graph by Singapore Planning Area and age group`

Plotting the graph to show the number of population across the age group by Planning area for 20 years (2000 - 2020).

animate1 <- ggplot (Agegroup_PA, aes(x = AG, y = Pop/1000))+ 
    geom_col () +
  coord_flip() +
ggtitle('Planning area: {closest_state}') +
  labs (x = 'AG',
        y = 'Population (thousand)') +
transition_states (PA) +
  ease_aes('linear') +
  enter_fade() +
  exit_fade()

animate(animate1,fps=2)